In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
df = pd.read_csv('athlete_events.csv')
region_df = pd.read_csv('noc_regions.csv')
In [3]:
df.tail()
Out[3]:
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal
271111 135569 Andrzej ya M 29.0 179.0 89.0 Poland-1 POL 1976 Winter 1976 Winter Innsbruck Luge Luge Mixed (Men)'s Doubles NaN
271112 135570 Piotr ya M 27.0 176.0 59.0 Poland POL 2014 Winter 2014 Winter Sochi Ski Jumping Ski Jumping Men's Large Hill, Individual NaN
271113 135570 Piotr ya M 27.0 176.0 59.0 Poland POL 2014 Winter 2014 Winter Sochi Ski Jumping Ski Jumping Men's Large Hill, Team NaN
271114 135571 Tomasz Ireneusz ya M 30.0 185.0 96.0 Poland POL 1998 Winter 1998 Winter Nagano Bobsleigh Bobsleigh Men's Four NaN
271115 135571 Tomasz Ireneusz ya M 34.0 185.0 96.0 Poland POL 2002 Winter 2002 Winter Salt Lake City Bobsleigh Bobsleigh Men's Four NaN
In [4]:
df.shape
Out[4]:
(271116, 15)
In [5]:
#To FILTER summer season data only
df = df[df['Season']=='Summer']
In [6]:
df.shape
Out[6]:
(222552, 15)
In [7]:
df.tail()
Out[7]:
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal
271106 135565 Fernando scar Zylberberg M 27.0 168.0 76.0 Argentina ARG 2004 Summer 2004 Summer Athina Hockey Hockey Men's Hockey NaN
271107 135566 James Francis "Jim" Zylker M 21.0 175.0 75.0 United States USA 1972 Summer 1972 Summer Munich Football Football Men's Football NaN
271108 135567 Aleksandr Viktorovich Zyuzin M 24.0 183.0 72.0 Russia RUS 2000 Summer 2000 Summer Sydney Rowing Rowing Men's Lightweight Coxless Fours NaN
271109 135567 Aleksandr Viktorovich Zyuzin M 28.0 183.0 72.0 Russia RUS 2004 Summer 2004 Summer Athina Rowing Rowing Men's Lightweight Coxless Fours NaN
271110 135568 Olga Igorevna Zyuzkova F 33.0 171.0 69.0 Belarus BLR 2016 Summer 2016 Summer Rio de Janeiro Basketball Basketball Women's Basketball NaN
In [8]:
region_df.tail()
Out[8]:
NOC region notes
225 YEM Yemen NaN
226 YMD Yemen South Yemen
227 YUG Serbia Yugoslavia
228 ZAM Zambia NaN
229 ZIM Zimbabwe NaN
In [9]:
#to LEFT JOIN both tables
df = df.merge(region_df,on = 'NOC',how = 'left')
In [10]:
df.tail()
Out[10]:
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal region notes
222547 135565 Fernando scar Zylberberg M 27.0 168.0 76.0 Argentina ARG 2004 Summer 2004 Summer Athina Hockey Hockey Men's Hockey NaN Argentina NaN
222548 135566 James Francis "Jim" Zylker M 21.0 175.0 75.0 United States USA 1972 Summer 1972 Summer Munich Football Football Men's Football NaN USA NaN
222549 135567 Aleksandr Viktorovich Zyuzin M 24.0 183.0 72.0 Russia RUS 2000 Summer 2000 Summer Sydney Rowing Rowing Men's Lightweight Coxless Fours NaN Russia NaN
222550 135567 Aleksandr Viktorovich Zyuzin M 28.0 183.0 72.0 Russia RUS 2004 Summer 2004 Summer Athina Rowing Rowing Men's Lightweight Coxless Fours NaN Russia NaN
222551 135568 Olga Igorevna Zyuzkova F 33.0 171.0 69.0 Belarus BLR 2016 Summer 2016 Summer Rio de Janeiro Basketball Basketball Women's Basketball NaN Belarus NaN
In [11]:
#how many countries participated
df['region'].unique()
Out[11]:
array(['China', 'Denmark', 'Netherlands', 'Finland', 'Norway', 'Romania',
       'Estonia', 'France', 'Morocco', 'Spain', 'Egypt', 'Iran',
       'Bulgaria', 'Italy', 'Chad', 'Azerbaijan', 'Sudan', 'Russia',
       'Argentina', 'Cuba', 'Belarus', 'Greece', 'Cameroon', 'Turkey',
       'Chile', 'Mexico', 'USA', 'Nicaragua', 'Hungary', 'Nigeria',
       'Algeria', 'Kuwait', 'Bahrain', 'Pakistan', 'Iraq', 'Syria',
       'Lebanon', 'Qatar', 'Malaysia', 'Germany', 'Canada', 'Ireland',
       'Australia', 'South Africa', 'Eritrea', 'Tanzania', 'Jordan',
       'Tunisia', 'Libya', 'Belgium', 'Djibouti', 'Palestine', 'Comoros',
       'Kazakhstan', 'Brunei', 'India', 'Saudi Arabia', 'Maldives',
       'Ethiopia', 'United Arab Emirates', 'Yemen', 'Indonesia',
       'Philippines', nan, 'Uzbekistan', 'Kyrgyzstan', 'Tajikistan',
       'Japan', 'Republic of Congo', 'Switzerland', 'Brazil', 'Monaco',
       'Israel', 'Uruguay', 'Sweden', 'Sri Lanka', 'Armenia',
       'Ivory Coast', 'Kenya', 'Benin', 'UK', 'Ghana', 'Somalia', 'Niger',
       'Mali', 'Afghanistan', 'Poland', 'Costa Rica', 'Panama', 'Georgia',
       'Slovenia', 'Guyana', 'New Zealand', 'Portugal', 'Paraguay',
       'Angola', 'Venezuela', 'Colombia', 'Bangladesh', 'Peru',
       'El Salvador', 'Puerto Rico', 'Uganda', 'Honduras', 'Ecuador',
       'Turkmenistan', 'Mauritius', 'Seychelles', 'Czech Republic',
       'Luxembourg', 'Mauritania', 'Saint Kitts', 'Trinidad',
       'Dominican Republic', 'Saint Vincent', 'Jamaica', 'Liberia',
       'Suriname', 'Nepal', 'Mongolia', 'Austria', 'Palau', 'Lithuania',
       'Togo', 'Namibia', 'Curacao', 'Ukraine', 'Iceland',
       'American Samoa', 'Samoa', 'Rwanda', 'Croatia', 'Dominica',
       'Haiti', 'Malta', 'Cyprus', 'Guinea', 'Belize', 'Thailand',
       'Bermuda', 'Serbia', 'Sierra Leone', 'Papua New Guinea',
       'Individual Olympic Athletes', 'Oman', 'Fiji', 'Vanuatu',
       'Moldova', 'Bahamas', 'Guatemala', 'Latvia',
       'Virgin Islands, British', 'Mozambique', 'Virgin Islands, US',
       'Central African Republic', 'Madagascar', 'Bosnia and Herzegovina',
       'Guam', 'Cayman Islands', 'Slovakia', 'Barbados', 'Guinea-Bissau',
       'Timor-Leste', 'Democratic Republic of the Congo', 'Gabon',
       'San Marino', 'Laos', 'Botswana', 'South Korea', 'Cambodia',
       'North Korea', 'Solomon Islands', 'Senegal', 'Cape Verde',
       'Equatorial Guinea', 'Boliva', 'Antigua', 'Andorra', 'Zimbabwe',
       'Grenada', 'Saint Lucia', 'Micronesia', 'Myanmar', 'Malawi',
       'Zambia', 'Taiwan', 'Sao Tome and Principe', 'Macedonia',
       'Liechtenstein', 'Montenegro', 'Gambia', 'Cook Islands', 'Albania',
       'Swaziland', 'Burkina Faso', 'Burundi', 'Aruba', 'Nauru',
       'Vietnam', 'Bhutan', 'Marshall Islands', 'Kiribati', 'Tonga',
       'Kosovo', 'South Sudan', 'Lesotho'], dtype=object)
In [12]:
df['region'].unique().shape
Out[12]:
(206,)
In [13]:
df.isnull().sum()
Out[13]:
ID             0
Name           0
Sex            0
Age         9189
Height     51857
Weight     53854
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     188464
region       370
notes     218151
dtype: int64
In [14]:
df.duplicated().sum()
Out[14]:
1385
In [15]:
df.drop_duplicates(inplace = True)
In [16]:
df.duplicated().sum()
Out[16]:
0
In [17]:
df['Medal'].value_counts()
Out[17]:
Gold      11456
Bronze    11409
Silver    11212
Name: Medal, dtype: int64
In [18]:
pd.get_dummies(df['Medal'])
Out[18]:
Bronze Gold Silver
0 0 0 0
1 0 0 0
2 0 0 0
3 0 1 0
4 0 0 0
... ... ... ...
222547 0 0 0
222548 0 0 0
222549 0 0 0
222550 0 0 0
222551 0 0 0

221167 rows × 3 columns

In [19]:
df = pd.concat([df,pd.get_dummies(df['Medal'])],axis = 1) #concat horizontally
In [20]:
df.groupby('NOC').sum()[['Gold','Silver','Bronze']].sort_values('Gold',ascending = False).reset_index()
Out[20]:
NOC Gold Silver Bronze
0 USA 2472.0 1333.0 1197.0
1 URS 832.0 635.0 596.0
2 GBR 635.0 729.0 620.0
3 GER 592.0 538.0 649.0
4 ITA 518.0 474.0 454.0
... ... ... ... ...
225 AHO 0.0 1.0 0.0
226 LBR 0.0 0.0 0.0
227 LCA 0.0 0.0 0.0
228 LES 0.0 0.0 0.0
229 LBA 0.0 0.0 0.0

230 rows × 4 columns

In [21]:
df.tail()
Out[21]:
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal region notes Bronze Gold Silver
222547 135565 Fernando scar Zylberberg M 27.0 168.0 76.0 Argentina ARG 2004 Summer 2004 Summer Athina Hockey Hockey Men's Hockey NaN Argentina NaN 0 0 0
222548 135566 James Francis "Jim" Zylker M 21.0 175.0 75.0 United States USA 1972 Summer 1972 Summer Munich Football Football Men's Football NaN USA NaN 0 0 0
222549 135567 Aleksandr Viktorovich Zyuzin M 24.0 183.0 72.0 Russia RUS 2000 Summer 2000 Summer Sydney Rowing Rowing Men's Lightweight Coxless Fours NaN Russia NaN 0 0 0
222550 135567 Aleksandr Viktorovich Zyuzin M 28.0 183.0 72.0 Russia RUS 2004 Summer 2004 Summer Athina Rowing Rowing Men's Lightweight Coxless Fours NaN Russia NaN 0 0 0
222551 135568 Olga Igorevna Zyuzkova F 33.0 171.0 69.0 Belarus BLR 2016 Summer 2016 Summer Rio de Janeiro Basketball Basketball Women's Basketball NaN Belarus NaN 0 0 0
In [22]:
#to get rows where India won gold medal
df[(df['NOC']=='IND')&(df['Medal']=='Gold')].shape
Out[22]:
(131, 20)
In [67]:
#so drop duplicates to count 1 medal for 1 team instead of each player
medal_tally = df.drop_duplicates(subset=['NOC','Games','Sport','Event','Medal'])
medal_tally.tail()
Out[67]:
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal region notes Bronze Gold Silver
222528 135553 Galina Ivanovna Zybina (-Fyodorova) F 25.0 168.0 80.0 Soviet Union URS 1956 Summer 1956 Summer Melbourne Athletics Athletics Women's Shot Put Silver Russia NaN 0 0 1
222530 135553 Galina Ivanovna Zybina (-Fyodorova) F 33.0 168.0 80.0 Soviet Union URS 1964 Summer 1964 Summer Tokyo Athletics Athletics Women's Shot Put Bronze Russia NaN 1 0 0
222536 135556 Bogusaw Stanisaw Zychowicz M 19.0 189.0 80.0 Poland POL 1980 Summer 1980 Summer Moskva Swimming Swimming Men's 100 metres Butterfly NaN Poland NaN 0 0 0
222537 135556 Bogusaw Stanisaw Zychowicz M 19.0 189.0 80.0 Poland POL 1980 Summer 1980 Summer Moskva Swimming Swimming Men's 200 metres Butterfly NaN Poland NaN 0 0 0
222541 135560 Stavroula Zygouri F 36.0 171.0 63.0 Greece GRE 2004 Summer 2004 Summer Athina Wrestling Wrestling Women's Middleweight, Freestyle NaN Greece NaN 0 0 0
In [68]:
medal_tally = medal_tally.groupby('NOC').sum()[['Gold','Silver','Bronze']].sort_values('Gold',ascending = False).reset_index()
medal_tally.head()
Out[68]:
NOC Gold Silver Bronze
0 USA 1035.0 802.0 707.0
1 URS 394.0 317.0 294.0
2 GBR 278.0 316.0 298.0
3 GER 233.0 261.0 282.0
4 FRA 233.0 255.0 282.0
In [25]:
medal_tally[medal_tally['NOC']=='IND']
Out[25]:
NOC Gold Silver Bronze
53 IND 9.0 7.0 12.0
In [26]:
 medal_tally['total'] = medal_tally['Gold']+medal_tally['Silver']+medal_tally['Bronze']
In [27]:
medal_tally.head()
Out[27]:
NOC Gold Silver Bronze total
0 USA 1035.0 802.0 707.0 2544.0
1 URS 394.0 317.0 294.0 1005.0
2 GBR 278.0 316.0 298.0 892.0
3 GER 233.0 261.0 282.0 776.0
4 FRA 233.0 255.0 282.0 770.0
In [28]:
years = df['Year'].unique().tolist()
In [29]:
years.sort()
In [30]:
years
Out[30]:
[1896,
 1900,
 1904,
 1906,
 1908,
 1912,
 1920,
 1924,
 1928,
 1932,
 1936,
 1948,
 1952,
 1956,
 1960,
 1964,
 1968,
 1972,
 1976,
 1980,
 1984,
 1988,
 1992,
 1996,
 2000,
 2004,
 2008,
 2012,
 2016]
In [31]:
years.insert(0,'Overall')
In [32]:
years
Out[32]:
['Overall',
 1896,
 1900,
 1904,
 1906,
 1908,
 1912,
 1920,
 1924,
 1928,
 1932,
 1936,
 1948,
 1952,
 1956,
 1960,
 1964,
 1968,
 1972,
 1976,
 1980,
 1984,
 1988,
 1992,
 1996,
 2000,
 2004,
 2008,
 2012,
 2016]
In [33]:
country=df['region'].dropna().unique().tolist()
In [34]:
country.sort()
In [35]:
country
Out[35]:
['Afghanistan',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Antigua',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Boliva',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curacao',
 'Cyprus',
 'Czech Republic',
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guam',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Honduras',
 'Hungary',
 'Iceland',
 'India',
 'Individual Olympic Athletes',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Ivory Coast',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kiribati',
 'Kosovo',
 'Kuwait',
 'Kyrgyzstan',
 'Laos',
 'Latvia',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Libya',
 'Liechtenstein',
 'Lithuania',
 'Luxembourg',
 'Macedonia',
 'Madagascar',
 'Malawi',
 'Malaysia',
 'Maldives',
 'Mali',
 'Malta',
 'Marshall Islands',
 'Mauritania',
 'Mauritius',
 'Mexico',
 'Micronesia',
 'Moldova',
 'Monaco',
 'Mongolia',
 'Montenegro',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Namibia',
 'Nauru',
 'Nepal',
 'Netherlands',
 'New Zealand',
 'Nicaragua',
 'Niger',
 'Nigeria',
 'North Korea',
 'Norway',
 'Oman',
 'Pakistan',
 'Palau',
 'Palestine',
 'Panama',
 'Papua New Guinea',
 'Paraguay',
 'Peru',
 'Philippines',
 'Poland',
 'Portugal',
 'Puerto Rico',
 'Qatar',
 'Republic of Congo',
 'Romania',
 'Russia',
 'Rwanda',
 'Saint Kitts',
 'Saint Lucia',
 'Saint Vincent',
 'Samoa',
 'San Marino',
 'Sao Tome and Principe',
 'Saudi Arabia',
 'Senegal',
 'Serbia',
 'Seychelles',
 'Sierra Leone',
 'Slovakia',
 'Slovenia',
 'Solomon Islands',
 'Somalia',
 'South Africa',
 'South Korea',
 'South Sudan',
 'Spain',
 'Sri Lanka',
 'Sudan',
 'Suriname',
 'Swaziland',
 'Sweden',
 'Switzerland',
 'Syria',
 'Taiwan',
 'Tajikistan',
 'Tanzania',
 'Thailand',
 'Timor-Leste',
 'Togo',
 'Tonga',
 'Trinidad',
 'Tunisia',
 'Turkey',
 'Turkmenistan',
 'UK',
 'USA',
 'Uganda',
 'Ukraine',
 'United Arab Emirates',
 'Uruguay',
 'Uzbekistan',
 'Vanuatu',
 'Venezuela',
 'Vietnam',
 'Virgin Islands, British',
 'Virgin Islands, US',
 'Yemen',
 'Zambia',
 'Zimbabwe']
In [36]:
country.insert(0,'Overall')
In [37]:
medal_tally1 = df.drop_duplicates(subset=['NOC','Games','Sport','Event','Medal'])
In [38]:
country
Out[38]:
['Overall',
 'Afghanistan',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Antigua',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Boliva',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curacao',
 'Cyprus',
 'Czech Republic',
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guam',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Honduras',
 'Hungary',
 'Iceland',
 'India',
 'Individual Olympic Athletes',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Ivory Coast',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kiribati',
 'Kosovo',
 'Kuwait',
 'Kyrgyzstan',
 'Laos',
 'Latvia',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Libya',
 'Liechtenstein',
 'Lithuania',
 'Luxembourg',
 'Macedonia',
 'Madagascar',
 'Malawi',
 'Malaysia',
 'Maldives',
 'Mali',
 'Malta',
 'Marshall Islands',
 'Mauritania',
 'Mauritius',
 'Mexico',
 'Micronesia',
 'Moldova',
 'Monaco',
 'Mongolia',
 'Montenegro',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Namibia',
 'Nauru',
 'Nepal',
 'Netherlands',
 'New Zealand',
 'Nicaragua',
 'Niger',
 'Nigeria',
 'North Korea',
 'Norway',
 'Oman',
 'Pakistan',
 'Palau',
 'Palestine',
 'Panama',
 'Papua New Guinea',
 'Paraguay',
 'Peru',
 'Philippines',
 'Poland',
 'Portugal',
 'Puerto Rico',
 'Qatar',
 'Republic of Congo',
 'Romania',
 'Russia',
 'Rwanda',
 'Saint Kitts',
 'Saint Lucia',
 'Saint Vincent',
 'Samoa',
 'San Marino',
 'Sao Tome and Principe',
 'Saudi Arabia',
 'Senegal',
 'Serbia',
 'Seychelles',
 'Sierra Leone',
 'Slovakia',
 'Slovenia',
 'Solomon Islands',
 'Somalia',
 'South Africa',
 'South Korea',
 'South Sudan',
 'Spain',
 'Sri Lanka',
 'Sudan',
 'Suriname',
 'Swaziland',
 'Sweden',
 'Switzerland',
 'Syria',
 'Taiwan',
 'Tajikistan',
 'Tanzania',
 'Thailand',
 'Timor-Leste',
 'Togo',
 'Tonga',
 'Trinidad',
 'Tunisia',
 'Turkey',
 'Turkmenistan',
 'UK',
 'USA',
 'Uganda',
 'Ukraine',
 'United Arab Emirates',
 'Uruguay',
 'Uzbekistan',
 'Vanuatu',
 'Venezuela',
 'Vietnam',
 'Virgin Islands, British',
 'Virgin Islands, US',
 'Yemen',
 'Zambia',
 'Zimbabwe']
In [39]:
def fetch_medal_tally(df,year,country):
    medal_tally1 = df.drop_duplicates(subset=['NOC','Games','Sport','Event','Medal'])
    flag = 0
    if year == 'Overall' and country == 'Overall':
        temp_df = medal_tally1
    if year == 'Overall' and country != 'Overall':
        flag = 1
        temp_df = medal_tally1[medal_tally1['region']==country]
    if year != 'Overall' and country == 'Overall':
        temp_df = medal_tally1[medal_tally1['Year']==year]    
    if year != 'Overall' and country != 'Overall':
        temp_df = medal_tally1[(medal_tally1['region']==country) & (medal_tally1['Year']==int(year))]
    if(flag==1): #we want to check performance of a country in every year olympic
        x = temp_df.groupby('Year').sum()[['Gold','Silver','Bronze']].sort_values('Gold',ascending = False).reset_index()
    else:
        x = temp_df.groupby('region').sum()[['Gold','Silver','Bronze']].sort_values('Gold',ascending = False).reset_index()
    x['Total']=x['Gold']+x['Silver']+x['Bronze']
    return x   
In [40]:
fetch_medal_tally(df,1992,'Overall')
Out[40]:
region Gold Silver Bronze Total
0 Russia 45 38 29 112
1 USA 37 34 37 108
2 Germany 33 21 28 82
3 China 16 22 15 53
4 Cuba 14 6 11 31
... ... ... ... ... ...
162 Haiti 0 0 0 0
163 Honduras 0 0 0 0
164 Iceland 0 0 0 0
165 India 0 0 0 0
166 Zimbabwe 0 0 0 0

167 rows × 5 columns

In [41]:
medal_tally1
Out[41]:
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal region notes Bronze Gold Silver
0 1 A Dijiang M 24.0 180.0 80.0 China CHN 1992 Summer 1992 Summer Barcelona Basketball Basketball Men's Basketball NaN China NaN 0 0 0
1 2 A Lamusi M 23.0 170.0 60.0 China CHN 2012 Summer 2012 Summer London Judo Judo Men's Extra-Lightweight NaN China NaN 0 0 0
2 3 Gunnar Nielsen Aaby M 24.0 NaN NaN Denmark DEN 1920 Summer 1920 Summer Antwerpen Football Football Men's Football NaN Denmark NaN 0 0 0
3 4 Edgar Lindenau Aabye M 34.0 NaN NaN Denmark/Sweden DEN 1900 Summer 1900 Summer Paris Tug-Of-War Tug-Of-War Men's Tug-Of-War Gold Denmark NaN 0 1 0
4 8 Cornelia "Cor" Aalten (-Strannood) F 18.0 168.0 NaN Netherlands NED 1932 Summer 1932 Summer Los Angeles Athletics Athletics Women's 100 metres NaN Netherlands NaN 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
222528 135553 Galina Ivanovna Zybina (-Fyodorova) F 25.0 168.0 80.0 Soviet Union URS 1956 Summer 1956 Summer Melbourne Athletics Athletics Women's Shot Put Silver Russia NaN 0 0 1
222530 135553 Galina Ivanovna Zybina (-Fyodorova) F 33.0 168.0 80.0 Soviet Union URS 1964 Summer 1964 Summer Tokyo Athletics Athletics Women's Shot Put Bronze Russia NaN 1 0 0
222536 135556 Bogusaw Stanisaw Zychowicz M 19.0 189.0 80.0 Poland POL 1980 Summer 1980 Summer Moskva Swimming Swimming Men's 100 metres Butterfly NaN Poland NaN 0 0 0
222537 135556 Bogusaw Stanisaw Zychowicz M 19.0 189.0 80.0 Poland POL 1980 Summer 1980 Summer Moskva Swimming Swimming Men's 200 metres Butterfly NaN Poland NaN 0 0 0
222541 135560 Stavroula Zygouri F 36.0 171.0 63.0 Greece GRE 2004 Summer 2004 Summer Athina Wrestling Wrestling Women's Middleweight, Freestyle NaN Greece NaN 0 0 0

105168 rows × 20 columns

In [42]:
fetch_medal_tally(df,'Overall','Norway')
Out[42]:
Year Gold Silver Bronze Total
0 1920 10 10 9 29
1 1924 5 2 3 10
2 2004 5 0 1 6
3 2000 4 3 3 10
4 1912 4 1 5 10
5 1952 3 2 0 5
6 2008 3 5 1 9
7 1992 2 4 1 7
8 1988 2 3 0 5
9 1972 2 1 1 4
10 1904 2 0 0 2
11 1996 2 2 3 7
12 2012 2 1 1 4
13 1908 2 3 3 8
14 1956 1 0 2 3
15 1948 1 3 3 7
16 1936 1 3 2 6
17 1968 1 1 0 2
18 1976 1 1 0 2
19 1928 1 2 1 4
20 1906 1 1 0 2
21 1960 1 0 0 1
22 1900 0 2 3 5
23 1984 0 1 2 3
24 1964 0 0 0 0
25 1932 0 0 0 0
26 2016 0 0 4 4
In [43]:
fetch_medal_tally(df,'Overall','Overall')
Out[43]:
region Gold Silver Bronze Total
0 USA 1035.0 802.0 707.0 2544.0
1 Russia 592.0 498.0 487.0 1577.0
2 Germany 442.0 457.0 490.0 1389.0
3 UK 278.0 316.0 298.0 892.0
4 France 233.0 255.0 282.0 770.0
... ... ... ... ... ...
200 Lesotho 0.0 0.0 0.0 0.0
201 Albania 0.0 0.0 0.0 0.0
202 Libya 0.0 0.0 0.0 0.0
203 Liechtenstein 0.0 0.0 0.0 0.0
204 Liberia 0.0 0.0 0.0 0.0

205 rows × 5 columns

In [44]:
 fetch_medal_tally(df,2016,'USA')
Out[44]:
region Gold Silver Bronze Total
0 USA 46 37 38 121
In [45]:
#Overall Analysis
#No. of editions, cities, events/sports, athletes, participating nations
In [46]:
df.head()
Out[46]:
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal region notes Bronze Gold Silver
0 1 A Dijiang M 24.0 180.0 80.0 China CHN 1992 Summer 1992 Summer Barcelona Basketball Basketball Men's Basketball NaN China NaN 0 0 0
1 2 A Lamusi M 23.0 170.0 60.0 China CHN 2012 Summer 2012 Summer London Judo Judo Men's Extra-Lightweight NaN China NaN 0 0 0
2 3 Gunnar Nielsen Aaby M 24.0 NaN NaN Denmark DEN 1920 Summer 1920 Summer Antwerpen Football Football Men's Football NaN Denmark NaN 0 0 0
3 4 Edgar Lindenau Aabye M 34.0 NaN NaN Denmark/Sweden DEN 1900 Summer 1900 Summer Paris Tug-Of-War Tug-Of-War Men's Tug-Of-War Gold Denmark NaN 0 1 0
4 8 Cornelia "Cor" Aalten (-Strannood) F 18.0 168.0 NaN Netherlands NED 1932 Summer 1932 Summer Los Angeles Athletics Athletics Women's 100 metres NaN Netherlands NaN 0 0 0
In [47]:
df['Year'].unique().shape[0]-1   #total number of olympics
Out[47]:
28
In [48]:
df['City'].unique().shape
Out[48]:
(23,)
In [49]:
df['Sport'].unique().shape
Out[49]:
(52,)
In [50]:
df['Event'].unique().shape
Out[50]:
(651,)
In [51]:
df['Name'].unique().shape
Out[51]:
(116122,)
In [52]:
df['region'].unique().shape
Out[52]:
(206,)
In [53]:
df.drop_duplicates(subset=['Year','region'])
Out[53]:
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal region notes Bronze Gold Silver
0 1 A Dijiang M 24.0 180.0 80.0 China CHN 1992 Summer 1992 Summer Barcelona Basketball Basketball Men's Basketball NaN China NaN 0 0 0
1 2 A Lamusi M 23.0 170.0 60.0 China CHN 2012 Summer 2012 Summer London Judo Judo Men's Extra-Lightweight NaN China NaN 0 0 0
2 3 Gunnar Nielsen Aaby M 24.0 NaN NaN Denmark DEN 1920 Summer 1920 Summer Antwerpen Football Football Men's Football NaN Denmark NaN 0 0 0
3 4 Edgar Lindenau Aabye M 34.0 NaN NaN Denmark/Sweden DEN 1900 Summer 1900 Summer Paris Tug-Of-War Tug-Of-War Men's Tug-Of-War Gold Denmark NaN 0 1 0
4 8 Cornelia "Cor" Aalten (-Strannood) F 18.0 168.0 NaN Netherlands NED 1932 Summer 1932 Summer Los Angeles Athletics Athletics Women's 100 metres NaN Netherlands NaN 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
198805 121320 Horatio Tertuliano Torrom M NaN NaN NaN Argentina ARG 1908 Summer 1908 Summer London Figure Skating Figure Skating Men's Singles NaN Argentina NaN 0 0 0
203075 123858 Lloyd Oscar Valberg M 26.0 NaN NaN Singapore SGP 1948 Summer 1948 Summer London Athletics Athletics Men's High Jump NaN NaN NaN 0 0 0
209763 127894 Bruno Julius Wagner M 25.0 185.0 82.0 Switzerland SUI 1908 Summer 1908 Summer London Athletics Athletics Men's Hammer Throw NaN Switzerland NaN 0 0 0
215352 131186 Antoni Wiwulski M 35.0 NaN NaN Poland POL 1912 Summer 1912 Summer Stockholm Art Competitions Art Competitions Mixed Sculpturing NaN Poland NaN 0 0 0
216735 132030 Mariana Dias Ximenes F 24.0 151.0 48.0 Timor Leste TLS 2008 Summer 2008 Summer Beijing Athletics Athletics Women's Marathon NaN Timor-Leste NaN 0 0 0

2786 rows × 20 columns

In [54]:
nations_overtime = df.drop_duplicates(subset=['Year','region'])['Year'].value_counts().sort_values().reset_index()
In [55]:
nations_overtime.rename(columns = {'index' : 'Edition','Year' : 'Number of Countries'},inplace=True)
In [56]:
#draw line graph
import plotly.express as px
In [57]:
fig = px.line(nations_overtime , x = 'Edition' , y = 'Number of Countries')
fig.show()
In [69]:
#number of events over years
events_overtime = df.drop_duplicates(subset=['Year','Event'])['Year'].value_counts().sort_values().reset_index()
events_overtime.head()
Out[69]:
index Year
0 1896 43
1 1906 74
2 1900 90
3 1904 95
4 1912 107
In [59]:
events_overtime.rename(columns={'index':'Edition','Year':'Number of Events'},inplace=True)
In [60]:
events_overtime
Out[60]:
Edition Number of Events
0 1896 43
1 1906 74
2 1900 90
3 1904 95
4 1912 107
5 1908 109
6 1928 122
7 1932 131
8 1924 131
9 1952 149
10 1936 150
11 1960 150
12 1956 151
13 1948 153
14 1920 158
15 1964 163
16 1968 172
17 1972 193
18 1976 198
19 1980 203
20 1984 221
21 1988 237
22 1992 257
23 1996 271
24 2000 300
25 2004 301
26 2012 302
27 2008 302
28 2016 306
In [61]:
fig2 = px.line(events_overtime,x='Edition',y='Number of Events')
fig2.show()
In [62]:
import seaborn as sns
In [63]:
df.head()
Out[63]:
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal region notes Bronze Gold Silver
0 1 A Dijiang M 24.0 180.0 80.0 China CHN 1992 Summer 1992 Summer Barcelona Basketball Basketball Men's Basketball NaN China NaN 0 0 0
1 2 A Lamusi M 23.0 170.0 60.0 China CHN 2012 Summer 2012 Summer London Judo Judo Men's Extra-Lightweight NaN China NaN 0 0 0
2 3 Gunnar Nielsen Aaby M 24.0 NaN NaN Denmark DEN 1920 Summer 1920 Summer Antwerpen Football Football Men's Football NaN Denmark NaN 0 0 0
3 4 Edgar Lindenau Aabye M 34.0 NaN NaN Denmark/Sweden DEN 1900 Summer 1900 Summer Paris Tug-Of-War Tug-Of-War Men's Tug-Of-War Gold Denmark NaN 0 1 0
4 8 Cornelia "Cor" Aalten (-Strannood) F 18.0 168.0 NaN Netherlands NED 1932 Summer 1932 Summer Los Angeles Athletics Athletics Women's 100 metres NaN Netherlands NaN 0 0 0
In [64]:
temp_df = df.dropna(subset=['Medal'])
temp_df['Name'].value_counts().reset_index().merge(df,left_on='index',right_on='Name',how='left')
    
Out[64]:
index Name_x ID Name_y Sex Age Height Weight Team NOC ... Season City Sport Event Medal region notes Bronze Gold Silver
0 Michael Fred Phelps, II 28 94406 Michael Fred Phelps, II M 15.0 193.0 91.0 United States USA ... Summer Sydney Swimming Swimming Men's 200 metres Butterfly NaN USA NaN 0 0 0
1 Michael Fred Phelps, II 28 94406 Michael Fred Phelps, II M 19.0 193.0 91.0 United States USA ... Summer Athina Swimming Swimming Men's 200 metres Freestyle Bronze USA NaN 1 0 0
2 Michael Fred Phelps, II 28 94406 Michael Fred Phelps, II M 19.0 193.0 91.0 United States USA ... Summer Athina Swimming Swimming Men's 4 x 100 metres Freestyle Relay Bronze USA NaN 1 0 0
3 Michael Fred Phelps, II 28 94406 Michael Fred Phelps, II M 19.0 193.0 91.0 United States USA ... Summer Athina Swimming Swimming Men's 4 x 200 metres Freestyle Relay Gold USA NaN 0 1 0
4 Michael Fred Phelps, II 28 94406 Michael Fred Phelps, II M 19.0 193.0 91.0 United States USA ... Summer Athina Swimming Swimming Men's 100 metres Butterfly Gold USA NaN 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
63088 Preben Isaksson 1 52433 Preben Isaksson M 21.0 183.0 73.0 Denmark DEN ... Summer Tokyo Cycling Cycling Men's Individual Pursuit, 4,000 metres Bronze Denmark NaN 1 0 0
63089 Preben Isaksson 1 52433 Preben Isaksson M 21.0 183.0 73.0 Denmark DEN ... Summer Tokyo Cycling Cycling Men's Team Pursuit, 4,000 metres NaN Denmark NaN 0 0 0
63090 rpd Lengyel 1 68707 rpd Lengyel M 20.0 NaN NaN Hungary HUN ... Summer Berlin Swimming Swimming Men's 400 metres Freestyle NaN Hungary NaN 0 0 0
63091 rpd Lengyel 1 68707 rpd Lengyel M 20.0 NaN NaN Hungary HUN ... Summer Berlin Swimming Swimming Men's 4 x 200 metres Freestyle Relay Bronze Hungary NaN 1 0 0
63092 rpd Lengyel 1 68707 rpd Lengyel M 20.0 NaN NaN Hungary HUN ... Summer Berlin Swimming Swimming Men's 100 metres Backstroke NaN Hungary NaN 0 0 0

63093 rows × 22 columns

In [65]:
sns.heatmap(df.pivot_table(index='Sport',column='Year',values='Event',aggfunc='count')).fillna(0).astype('int'),annot=True)
  Input In [65]
    sns.heatmap(df.pivot_table(index='Sport',column='Year',values='Event',aggfunc='count')).fillna(0).astype('int'),annot=True)
    ^
SyntaxError: cannot assign to function call
In [ ]:
x =df.drop_duplicates(subset=['Year','Sport','Event'])
In [ ]:
x
In [ ]:
x.pivot_table(index='Sport',columns='Year',values = 'Event',aggfunc='count').fillna(0).astype('int')
In [ ]:
import seaborn as sns
plt.figure(figsize=(25,25))
sns.heatmap(x.pivot_table(index='Sport',columns='Year',values = 'Event',aggfunc='count').fillna(0).astype('int'),annot=True)
In [ ]:
#left_on: Columns from the left DataFrame to use as keys. Can either be column names or arrays with length equal to the length of the DataFrame

#right_on: Columns from the right DataFrame to use as keys. Can either be column names or arrays with length equal to the length of the DataFrame

#.reset_index() converts it into a dataframe and dataframes can be merged using pandas
In [ ]:
def most_successful(df,sport):
    temp_df = df.dropna(subset=['Medal']) #filter those players with Nan/zero medals
    
    if(sport != 'Overall'):
        temp_df = temp_df[temp_df['Sport']==sport] #shows only names with the given sport
        
    x = temp_df['Name'].value_counts().reset_index().head(15).merge(df,left_on='index',right_on='Name',how='left')[['index','Name_x','Sport','region']].drop_duplicates('index')
    #means here df is right table and temp_df is left table and for merging take index of temp_df and Name of df
    x.rename(columns={'index':'Name','Name_x':'Number of Medals'},inplace=True)
    return x
In [ ]:
most_successful(df,'Overall')
In [ ]:
#CountryWise
#Countrywise medal tally per year line plot
#WHat countries are good at heatmap
#Most Successful Athletes Top 10
In [ ]:
df
In [ ]:
temp_df = df.dropna(subset=['Medal'])
temp_df.drop_duplicates(subset=['Team','Games','Medal','region','NOC','Event','Sport'],inplace=True)
In [ ]:
new_df = temp_df[temp_df['region']=='India']
In [ ]:
final_df = new_df.groupby('Year').count()['Medal'].reset_index()
In [ ]:
final_df
In [ ]:
fig = px.line(final_df,x='Year',y='Medal')
fig.show()
In [ ]:
temp_df = df.dropna(subset=['Medal'])
temp_df.drop_duplicates(subset=['Team','Games','Medal','region','NOC','Event','Sport'],inplace=True)
new_df = temp_df[temp_df['region']=='India']
In [ ]:
plt.figure(figsize=(20,20))
sns.heatmap(new_df.pivot_table(index='Sport',columns = 'Year', values = 'Medal' , aggfunc = 'count').fillna(0),annot = True)
In [ ]:
def most_successful_countrywise(df,country):
    temp_df = df.dropna(subset=['Medal']) #filter those players with Nan/zero medals
    
    temp_df = temp_df[temp_df['region']==country] #shows only names with the given sport
        
    x = temp_df['Name'].value_counts().reset_index().head(10).merge(df,left_on='index',right_on='Name',how='left')[['index','Name_x','Sport','region']].drop_duplicates('index')
    #means here df is right table and temp_df is left table and for merging take index of temp_df and Name of df
    #here name value count means this name or athlete has got medals how many times means how many medals he has got that much successful he is
    x.rename(columns={'index':'Name','Name_x':'Number of Medals'},inplace=True)
    return x
In [ ]:
most_successful_countrywise(df,'USA')
In [ ]:
#Athlete Analysis
import plotly.figure_factory as ff
In [ ]:
athlete_df = df.drop_duplicates(subset = ['Name','region'])
In [ ]:
x1 = athlete_df['Age'].dropna()
x2=athlete_df[athlete_df['Medal']=='Gold']['Age'].dropna()
x3=athlete_df[athlete_df['Medal']=='Silver']['Age'].dropna()
x4=athlete_df[athlete_df['Medal']=='Bronze']['Age'].dropna()
In [ ]:
fig=ff.create_distplot([x1,x2,x3,x4],['Overall Age ','Gold Medalist','Silver Medalist','Bronze Medalist'])
fig.show()
In [ ]:
fig=ff.create_distplot([x1,x2,x3,x4],['Overall Age ','Gold Medalist','Silver Medalist','Bronze Medalist'],show_hist=False,show_rug=False)
fig.show()
In [ ]:
famous_sports = ['Basketball'
,             'Judo'
,           'Football'
,         'Tug-Of-War'
,          'Athletics'
                 ,  'Hockey'
,      'Rowing'
,       'Golf',   
                 'Sailing','Polo','Shooting'
]
In [ ]:
df['Sport']
In [ ]:
x=[]
name=[]
for i in famous_sports:
    temp_df = athlete_df[athlete_df['Sport']==i]
    x.append(temp_df[temp_df['Medal']=='Gold']['Age'].dropna())
    name.append(i)
In [ ]:
fig = ff.create_distplot(x,name,show_hist=False,show_rug=False)
fig.show()
In [ ]:
athlete_df
In [ ]:
athlete_df['Medal'].fillna('No Medal',inplace = True)
plt.figure(figsize=(10,10))#used to increase size of the plot in next line
temp_df = athlete_df[athlete_df['Sport']=='Athletics']
sns.scatterplot(temp_df['Height'],temp_df['Weight'],hue=temp_df['Medal'],style=temp_df['Sex'],s=100)
#style means style of point and s means size of each marker/point
In [ ]:
men = athlete_df[athlete_df['Sex']=='M'].groupby('Year').count()#['Name'].reset_index()
women = athlete_df[athlete_df['Sex']=='F'].groupby('Year').count()['Name'].reset_index()
In [ ]:
men
In [ ]:
men.head()
In [ ]:
men = athlete_df[athlete_df['Sex']=='M'].groupby('Year').count()['Name'].reset_index()
women = athlete_df[athlete_df['Sex']=='F'].groupby('Year').count()['Name'].reset_index()
In [ ]:
men
In [ ]:
final = men.merge(women,on='Year',how='left')
final.rename(columns={'Name_x':'Male','Name_y':'Female'},inplace = True)
final=final.fillna(0)
In [ ]:
final
In [ ]:
fig = px.line(final,x='Year',y=['Male','Female'])
fig.show()
In [ ]: